# Computations
import pandas as pd
import numpy as np
# sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
# keras
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
# NOTE(review): SGD is imported but the model below compiles with 'rmsprop'.
from keras.optimizers import SGD
# NOTE(review): keras.utils.vis_utils was removed in newer Keras releases;
# plot_model now lives at keras.utils.plot_model — confirm installed version.
from keras.utils.vis_utils import plot_model
import keras.backend as K
# Visualisation libraries
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex
## seaborn
import seaborn as sns
sns.set_context('paper', rc={'font.size':12,'axes.titlesize':14,'axes.labelsize':12})
sns.set_style('white')
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
import matplotlib.gridspec as gridspec
import matplotlib.colors
from pylab import rcParams
# NOTE(review): the 'seaborn-whitegrid' style name was removed in matplotlib 3.6
# (renamed 'seaborn-v0_8-whitegrid') — confirm against the installed version.
plt.style.use('seaborn-whitegrid')
plt.rcParams['figure.figsize'] = 14, 8
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
%matplotlib inline
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
from plotly import tools
import plotly.express as px
import plotly.figure_factory as ff
# Graphics in retina format
%config InlineBackend.figure_format = 'retina'
import warnings
# Silence library deprecation chatter in the notebook output.
warnings.filterwarnings('ignore')
In this article, we analyze the UCI Statlog (German credit data) dataset from Kaggle.com.
The original dataset contains 1000 entries with 20 categorical/symbolic attributes prepared by Prof. Hofmann. In this dataset, each entry represents a person who takes a credit from a bank. Each person is classified as a good or bad credit risk according to the set of attributes. The link to the original dataset can be found below.
It is almost impossible to understand the original dataset due to its complicated system of categories and symbols. Thus, I wrote a small Python script to convert it into a readable CSV file. Several columns are simply ignored, because in my opinion either they are not important or their descriptions are obscure. The selected attributes are:
# Load the preprocessed German credit CSV; the first column is the row index.
Data = pd.read_csv('Data/german_credit_data.csv', index_col=0)
def Data_info(Inp, Only_NaN = False):
    """Summarize a DataFrame: per-column dtype, NaN count, and NaN percentage.

    Inp      : the DataFrame to summarize.
    Only_NaN : if True, keep only the columns that contain at least one NaN.
    Returns a DataFrame indexed by column name, sorted by dtype.
    """
    nan_counts = Inp.isnull().sum().to_frame(name = 'Number of NaN Values')
    summary = Inp.dtypes.to_frame(name='Data Type').sort_values(by=['Data Type'])
    summary = summary.join(nan_counts, how='outer')
    # Share of missing entries per column, rounded to two decimals.
    summary['Percentage'] = np.round(100 * summary['Number of NaN Values'] / Inp.shape[0], 2)
    if Only_NaN:
        summary = summary.loc[summary['Number of NaN Values'] > 0]
    return summary
#
# Print a cyan-on-black heading, then display each overview table.
for Heading, Table in [
    ('The Dataset:', Data.head()),
    ('Nan Values:', Data_info(Data)),
    ('Dataset Shape:', pd.DataFrame([Data.shape], columns = ['Instances','Attributes'],
                                    index = ['Dataset'])),
]:
    print(Back.BLACK + Fore.CYAN + Style.NORMAL + Heading)
    display(Table)
# Fill the missing account levels with an explicit 'None' category, then
# Title-Case every categorical column and the column names themselves.
for Col in ['Checking account', 'Saving accounts']:
    Data[Col] = Data[Col].fillna('None')
for Col in ['Sex', 'Housing', 'Checking account', 'Saving accounts', 'Purpose', 'Risk']:
    Data[Col] = Data[Col].map(lambda x: x.title())
# Undo the title-casing of the 'TV' acronym.
Data['Purpose'] = Data['Purpose'].replace({'Radio/Tv':'Radio/TV'})
Data.columns = Data.columns.str.title()
Data.head()
Creating new features:
We create an Age Category feature based on the Statistics Canada (statcan.gc.ca) age groupings.
| Interval | Age Category |
|---|---|
| 00-14 years | Children |
| 15-24 years | Youth |
| 25-64 years | Adults |
| 65 years and over | Seniors |
# Bin Age into the statcan.gc.ca categories. The interval->label map is built
# from the actual bins instead of hard-coding '(64, 75]', which was only
# correct when Data.Age.max() happened to be 75; this also labels the
# Children bin whenever ages below 15 are present (the original left those
# rows as raw interval strings).
Age_Edges = [(0, 14), (14, 24), (24, 64), (64, Data.Age.max())]
Age_Labels = ['Children', 'Youth', 'Adults', 'Seniors']
if Data.Age.min() >= 14:
    # No children in the data: drop the 0-14 bin, as before.
    Age_Edges, Age_Labels = Age_Edges[1:], Age_Labels[1:]
bins = pd.IntervalIndex.from_tuples(Age_Edges)
Data['Age Category'] = pd.cut(Data['Age'], bins).astype(str).\
    replace({str(iv): lab for iv, lab in zip(bins, Age_Labels)})
Data.head()
# Transposed dtype overview plus a printed list of the object (categorical) columns.
Data_Type = Data_info(Data).iloc[:, :1]
display(Data_Type.T)
Temp = Data_Type.index[Data_Type['Data Type'] == 'object'].tolist()
print(Back.BLACK + Fore.CYAN + Style.NORMAL + 'Categorical Features:' + Style.RESET_ALL + ' %s:' % ', '.join(Temp))
We can convert Age Category, Checking Account, Housing, Risk, Saving Accounts, and Sex as follows \begin{align} &\mbox{Age Category} = \begin{cases} 0, \mbox{Youth},\\ 1, \mbox{Adults},\\ 2, \mbox{Seniors}. \end{cases},& &\mbox{Checking Account} = \begin{cases} 0, \mbox{None},\\ 1, \mbox{Little},\\ 2, \mbox{Moderate},\\ 3, \mbox{Rich}. \end{cases},& \\ &\mbox{Housing} = \begin{cases} 0, \mbox{Free},\\ 1, \mbox{Rent},\\ 2, \mbox{Own}. \end{cases},& &\mbox{Risk} = \begin{cases} 0, \mbox{Bad},\\ 1, \mbox{Good}. \end{cases},& \\ &\mbox{Saving Accounts} = \begin{cases} 0, \mbox{None},\\ 1, \mbox{Little},\\ 2, \mbox{Moderate},\\ 3, \mbox{Rich},\\ 4, \mbox{Quite Rich}. \end{cases},& &\mbox{Sex} = \begin{cases} 0, \mbox{Female},\\ 1, \mbox{Male}. \end{cases}.& \end{align}
Moreover, Purpose can be converted as a dummy variable.
# Encode the ordinal/binary categorical columns as integers and one-hot
# encode Purpose; the raw Age column is superseded by Age Category.
df = Data.copy()
df = df.drop(columns = ['Age'])
Ordinal_Maps = {
    'Age Category': {'Youth': 0, 'Adults': 1, 'Seniors': 2},
    'Checking Account': {'None': 0, 'Little': 1, 'Moderate': 2, 'Rich': 3},
    'Housing': {'Free': 0, 'Rent': 1, 'Own': 2},
    'Risk': {'Bad': 0, 'Good': 1},
    'Saving Accounts': {'None': 0, 'Little': 1, 'Moderate': 2, 'Rich': 3, 'Quite Rich': 4},
    'Sex': {'Female': 0, 'Male': 1},
}
for Col, Mapping in Ordinal_Maps.items():
    df[Col] = df[Col].replace(Mapping).astype(int)
Temp = df.drop(columns = ['Purpose'])
df = pd.concat([Temp, pd.get_dummies(df['Purpose'])], axis = 1)
del Temp
Target = 'Risk'
X = df.drop(columns = [Target])
y = df[Target]
Now, let's take a look at the variance of the features.
# Per-feature variances, largest first. NOTE(review): Styler.set_precision
# was removed in pandas 2.0 — may need .format(precision=2) on newer pandas.
display(X.var().sort_values(ascending = False).to_frame(name= 'Variance').T.style.set_precision(2))
As we can see, some of the features have high variance, which is not desirable for our modeling. Thus, we would like to standardize features by removing the mean and scaling to unit variance. In this article, we demonstrated the benefits of scaling data using StandardScaler().
# Standardize every feature to zero mean / unit variance while keeping
# the original column labels (fit_transform returns a bare ndarray).
X = pd.DataFrame(StandardScaler().fit_transform(X), columns = X.columns)
Correlations of features with Class.
# Rebuild a labelled frame of the scaled features and append the target,
# so the correlation heatmap below includes Risk.
Temp = pd.DataFrame(X, columns = df.drop(columns = [Target]).columns)
Temp[Target] = y
def Correlation_Plot (Df,Fig_Size):
    """Draw a lower-triangle heatmap of the pairwise correlations in Df.

    Df       : DataFrame whose columns are correlated with each other.
    Fig_Size : side length (inches) of the square figure.
    """
    corr = Df.corr().round(2)
    # Mask the strict upper triangle; keep the diagonal visible.
    hide = np.zeros_like(corr)
    hide[np.triu_indices_from(hide)] = True
    np.fill_diagonal(hide, 0)
    _, axis = plt.subplots(figsize=(Fig_Size, Fig_Size))
    sns.heatmap(corr, ax=axis, mask=hide, annot=True, square=True,
                cmap =sns.color_palette("Greens", n_colors=10), linewidths = 0.2,
                vmin=0, vmax=1, cbar_kws={"shrink": .6})
# Heatmap of feature/target correlations on the scaled data.
Correlation_Plot (Temp, 14)
# One-hot encode the target (Bad/Good) for the two-unit network output.
y = pd.get_dummies(Data[Target]).astype(int)
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Tabulate the shapes of the four partitions.
pd.DataFrame({'Shape': [X_train.shape, X_test.shape, y_train.shape, y_test.shape]},
             index=pd.Index(['X_train','X_test','y_train','y_test'], name='Set')).T
# Four-layer fully connected classifier: 12 -> 10 -> 4 -> 2 units.
model = Sequential()
# Fix: the Keras 1 keywords `init=` and `nb_epoch=` were removed in Keras 2;
# the modern equivalents are `kernel_initializer=` and `epochs=`.
model.add(Dense(12, input_dim= X.shape[1], kernel_initializer='uniform', activation='relu'))
model.add(Dense(10, kernel_initializer='uniform', activation='sigmoid'))
model.add(Dense(4, kernel_initializer='uniform', activation='sigmoid'))
# NOTE(review): sigmoid on a one-hot output trained with categorical
# cross-entropy works, but 'softmax' is the conventional choice here.
model.add(Dense(y.shape[1], kernel_initializer='uniform', activation='sigmoid'))
# Number of iterations (epochs)
IT = int(2e3)+1
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy','mae', 'mse'])
# Train model
history = model.fit(X_train, y_train, epochs= IT, batch_size=50, verbose=0)
# Predictions and Score
y_pred = model.predict(X_test)
score = model.evaluate(X_test, y_test)
score = pd.DataFrame(score, index = model.metrics_names).T
history = pd.DataFrame(history.history)
# NOTE(review): Styler.hide_index was removed in pandas 2.0 (use .hide()).
display(score.style.hide_index())
# Interactive training-history curves: loss, accuracy, MAE and MSE per epoch.
fig = go.Figure()
curves = [('loss', 'OrangeRed', 'Loss'),
          ('accuracy', 'MidnightBlue', 'Accuracy'),
          ('mae', 'ForestGreen', 'Mean Absolute Error (MAE)'),
          ('mse', 'purple', 'Mean Squared Error (MSE)')]
for metric, colour, label in curves:
    fig.add_trace(go.Scatter(x= history.index.values, y= history[metric].values,
                             line=dict(color=colour, width= 1.5), name = label))
fig.update_layout(legend=dict(y=0.5, traceorder='reversed', font_size=12),
                  dragmode='select', plot_bgcolor= 'white', height=600, hovermode='closest')
# Light gray grid and a mirrored box frame on both axes; clamp the view
# to the trained epochs on x and the [0, 1] metric range on y.
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray',
                 showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
                 range=[0, history.index.values.max()])
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray',
                 showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
                 range=[0, 1.0])
fig.show()
Finally, a summary and a glimpse of the model.
# Text summary of layer shapes and parameter counts, plus a graph diagram.
model.summary()
# NOTE(review): plot_model is imported from keras.utils.vis_utils above, which
# newer Keras removed (now keras.utils.plot_model) — confirm installed version.
plot_model(model, show_shapes=True, show_layer_names=True, expand_nested = True)
Running the following code produces our model diagram:
# Render the network with ann_visualizer (writes a graphviz PDF), then
# rasterize the PDF to JPEG via pdf2image for embedding in the notebook.
from ann_visualizer.visualize import ann_viz
from pdf2image import convert_from_path
ann_viz(model, filename = 'Model02',title="The Model");
# convert_from_path yields one PIL image per PDF page; each save targets the
# same path, so effectively the last page wins.
for Img in convert_from_path('Model02.pdf'):
    Img.save('Model02.jpg', 'JPEG')

Labels = y.columns.tolist()

def Plot_Confusion_Matrices(y_true, y_pred, title):
    """Plot the raw and the row-normalized confusion matrix side by side.

    y_true : one-hot DataFrame of true labels.
    y_pred : array of predicted class scores (argmax taken per row).
    title  : figure suptitle, e.g. 'Train Set' or 'Test Set'.
    """
    cm = confusion_matrix(y_true.values.argmax(axis=1), y_pred.argmax(axis=1))
    fig, ax = plt.subplots(1, 2, figsize=(15, 5))
    fig.suptitle(title, fontsize = 18)
    sns.heatmap(cm, annot=True, annot_kws={"size": 14}, cmap="Blues", ax = ax[0],
                linewidths = 0.2, cbar_kws={"shrink": 1})
    # Row-normalize so each cell shows the fraction of its true class.
    cm_norm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    sns.heatmap(cm_norm, annot=True, annot_kws={"size": 14}, cmap="Greens", ax = ax[1],
                linewidths = 0.2, vmin=0, vmax=1, cbar_kws={"shrink": 1})
    for axis, subtitle in zip(ax, ['Confusion Matrix', 'Normalized Confusion Matrix']):
        axis.set_xlabel('Predicted labels')
        axis.set_ylabel('True labels')
        axis.set_title(subtitle)
        axis.xaxis.set_ticklabels(Labels)
        axis.yaxis.set_ticklabels(Labels)

# Train set
Plot_Confusion_Matrices(y_train, model.predict(X_train), 'Train Set')
# Test set
Plot_Confusion_Matrices(y_test, model.predict(X_test), 'Test Set')